{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create SparkContext, SparkSession instances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkConf, SparkContext\n",
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sc = SparkContext(conf=SparkConf())\n",
    "spark = SparkSession(sparkContext=sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read tabular data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mtcars = spark.read.csv('../../data/mtcars.csv', header=True, inferSchema=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|          _c0| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|\n",
      "+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|    Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|\n",
      "|Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|\n",
      "|   Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|\n",
      "+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars.show(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rename individual column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|    rown_ames| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|\n",
      "+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|    Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|\n",
      "|Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|\n",
      "|   Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|\n",
      "+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars = mtcars.withColumnRenamed('_c0', 'rown_ames')\n",
    "mtcars.show(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rename multple columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['x_rown_ames',\n",
       " 'x_mpg',\n",
       " 'x_cyl',\n",
       " 'x_disp',\n",
       " 'x_hp',\n",
       " 'x_drat',\n",
       " 'x_wt',\n",
       " 'x_qsec',\n",
       " 'x_vs',\n",
       " 'x_am',\n",
       " 'x_gear',\n",
       " 'x_carb']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_col_names = [ 'x_' + x for x in mtcars.columns]\n",
    "new_col_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+-----+-----+------+----+------+-----+------+----+----+------+------+\n",
      "|  x_rown_ames|x_mpg|x_cyl|x_disp|x_hp|x_drat| x_wt|x_qsec|x_vs|x_am|x_gear|x_carb|\n",
      "+-------------+-----+-----+------+----+------+-----+------+----+----+------+------+\n",
      "|    Mazda RX4| 21.0|    6| 160.0| 110|   3.9| 2.62| 16.46|   0|   1|     4|     4|\n",
      "|Mazda RX4 Wag| 21.0|    6| 160.0| 110|   3.9|2.875| 17.02|   0|   1|     4|     4|\n",
      "|   Datsun 710| 22.8|    4| 108.0|  93|  3.85| 2.32| 18.61|   1|   1|     4|     1|\n",
      "+-------------+-----+-----+------+----+------+-----+------+----+----+------+------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars = mtcars.rdd.toDF(new_col_names)\n",
    "mtcars.show(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read non-tabular data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Fresh install of XP on new computer. Sweet relief! fuck vista\\t1018769417\\t1.0',\n",
       " 'Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl\\t10284216536\\t1.0',\n",
       " '\"Literally six weeks before I can take off \"\"SSC Chair\"\" off my email. Its like the torturous 4th mile before everything stops hurting.\"\\t10298589026\\t1.0',\n",
       " 'Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever!\\t109017669432377344\\t1.0',\n",
       " \"'Cheap Eats in SLP' - http://t.co/4w8gRp7\\t109642968603963392\\t1.0\"]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "twitter = sc.textFile('../../data/twitter.txt')\n",
    "twitter.take(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Export data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import DataFrameWriter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before we write the data into a file, we need to coalesce the data into one sinle partition. Otherwise, there will be multiple output files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mtcars = mtcars.coalesce(numPartitions=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mtcars.write.csv('data/saved-mtcars', header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twitter = twitter.coalesce(numPartitions=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twitter.saveAsTextFile('data/saved-twitter')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
